#import important libraries
#libraries to read and manipulate data
import pandas as pd
import numpy as np
#libraries used in visualization
import plotly.express as px
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
Requirement already satisfied: ipywidgets in e:\anaconda3\lib\site-packages (7.6.3) Requirement already satisfied: ipykernel>=4.5.1 in e:\anaconda3\lib\site-packages (from ipywidgets) (5.3.4) Requirement already satisfied: traitlets>=4.3.1 in e:\anaconda3\lib\site-packages (from ipywidgets) (5.0.5) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in e:\anaconda3\lib\site-packages (from ipywidgets) (1.0.0) Requirement already satisfied: nbformat>=4.2.0 in e:\anaconda3\lib\site-packages (from ipywidgets) (5.1.3) Requirement already satisfied: widgetsnbextension~=3.5.0 in e:\anaconda3\lib\site-packages (from ipywidgets) (3.5.1) Requirement already satisfied: ipython>=4.0.0 in e:\anaconda3\lib\site-packages (from ipywidgets) (7.22.0) Requirement already satisfied: jupyter-client in e:\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1.12) Requirement already satisfied: tornado>=4.2 in e:\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1) Requirement already satisfied: pygments in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (2.8.1) Requirement already satisfied: setuptools>=18.5 in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (52.0.0.post20210125) Requirement already satisfied: colorama in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (0.4.4) Requirement already satisfied: pickleshare in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (0.7.5) Requirement already satisfied: jedi>=0.16 in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (0.17.2) Requirement already satisfied: decorator in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (5.0.6) Requirement already satisfied: backcall in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (0.2.0) Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in e:\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets) (3.0.17) Requirement 
already satisfied: parso<0.8.0,>=0.7.0 in e:\anaconda3\lib\site-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets) (0.7.0) Requirement already satisfied: jupyter-core in e:\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets) (4.7.1) Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in e:\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets) (3.2.0) Requirement already satisfied: ipython-genutils in e:\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets) (0.2.0) Requirement already satisfied: attrs>=17.4.0 in e:\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (20.3.0) Requirement already satisfied: six>=1.11.0 in e:\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (1.15.0) Requirement already satisfied: pyrsistent>=0.14.0 in e:\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (0.17.3) Requirement already satisfied: wcwidth in e:\anaconda3\lib\site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets) (0.2.5) Requirement already satisfied: notebook>=4.4.1 in e:\anaconda3\lib\site-packages (from widgetsnbextension~=3.5.0->ipywidgets) (6.3.0) Requirement already satisfied: jinja2 in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.11.3) Requirement already satisfied: argon2-cffi in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (20.1.0) Requirement already satisfied: nbconvert in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (6.0.7) Requirement already satisfied: terminado>=0.8.3 in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.9.4) Requirement already satisfied: pyzmq>=17 in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (20.0.0) Requirement 
already satisfied: prometheus-client in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.10.1) Requirement already satisfied: Send2Trash>=1.5.0 in e:\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.5.0) Requirement already satisfied: python-dateutil>=2.1 in e:\anaconda3\lib\site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (2.8.1) Requirement already satisfied: pywin32>=1.0 in e:\anaconda3\lib\site-packages (from jupyter-core->nbformat>=4.2.0->ipywidgets) (227) Requirement already satisfied: pywinpty>=0.5 in e:\anaconda3\lib\site-packages (from terminado>=0.8.3->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.7) Requirement already satisfied: cffi>=1.0.0 in e:\anaconda3\lib\site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.14.5) Requirement already satisfied: pycparser in e:\anaconda3\lib\site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.20) Requirement already satisfied: MarkupSafe>=0.23 in e:\anaconda3\lib\site-packages (from jinja2->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.1.1) Requirement already satisfied: bleach in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (3.3.0) Requirement already satisfied: testpath in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.4.4) Requirement already satisfied: entrypoints>=0.2.2 in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.3) Requirement already satisfied: jupyterlab-pygments in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.1.2) Requirement already satisfied: pandocfilters>=1.4.1 in e:\anaconda3\lib\site-packages (from 
nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.4.3) Requirement already satisfied: defusedxml in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.7.1) Requirement already satisfied: mistune<2,>=0.8.1 in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.8.4) Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in e:\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.3) Requirement already satisfied: nest-asyncio in e:\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.5.1) Requirement already satisfied: async-generator in e:\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.10) Requirement already satisfied: packaging in e:\anaconda3\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (20.9) Requirement already satisfied: webencodings in e:\anaconda3\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.1) Requirement already satisfied: pyparsing>=2.0.2 in e:\anaconda3\lib\site-packages (from packaging->bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.4.7)
Enabling notebook extension jupyter-js-widgets/extension...
- Validating: ok
# Load the movies dataset from 'tmdb-movies.csv' (path is relative to the
# notebook's working directory) into the frame used by every later cell
df = pd.read_csv('tmdb-movies.csv')
# Display the loaded frame
df
| id | imdb_id | popularity | budget | revenue | original_title | cast | homepage | director | tagline | ... | overview | runtime | genres | production_companies | release_date | vote_count | vote_average | release_year | budget_adj | revenue_adj | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 135397 | tt0369610 | 32.985763 | 150000000 | 1513528810 | Jurassic World | Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | http://www.jurassicworld.com/ | Colin Trevorrow | The park is open. | ... | Twenty-two years after the events of Jurassic ... | 124 | Action|Adventure|Science Fiction|Thriller | Universal Studios|Amblin Entertainment|Legenda... | 6/9/15 | 5562 | 6.5 | 2015 | 1.379999e+08 | 1.392446e+09 |
| 1 | 76341 | tt1392190 | 28.419936 | 150000000 | 378436354 | Mad Max: Fury Road | Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic... | http://www.madmaxmovie.com/ | George Miller | What a Lovely Day. | ... | An apocalyptic story set in the furthest reach... | 120 | Action|Adventure|Science Fiction|Thriller | Village Roadshow Pictures|Kennedy Miller Produ... | 5/13/15 | 6185 | 7.1 | 2015 | 1.379999e+08 | 3.481613e+08 |
| 2 | 262500 | tt2908446 | 13.112507 | 110000000 | 295238201 | Insurgent | Shailene Woodley|Theo James|Kate Winslet|Ansel... | http://www.thedivergentseries.movie/#insurgent | Robert Schwentke | One Choice Can Destroy You | ... | Beatrice Prior must confront her inner demons ... | 119 | Adventure|Science Fiction|Thriller | Summit Entertainment|Mandeville Films|Red Wago... | 3/18/15 | 2480 | 6.3 | 2015 | 1.012000e+08 | 2.716190e+08 |
| 3 | 140607 | tt2488496 | 11.173104 | 200000000 | 2068178225 | Star Wars: The Force Awakens | Harrison Ford|Mark Hamill|Carrie Fisher|Adam D... | http://www.starwars.com/films/star-wars-episod... | J.J. Abrams | Every generation has a story. | ... | Thirty years after defeating the Galactic Empi... | 136 | Action|Adventure|Science Fiction|Fantasy | Lucasfilm|Truenorth Productions|Bad Robot | 12/15/15 | 5292 | 7.5 | 2015 | 1.839999e+08 | 1.902723e+09 |
| 4 | 168259 | tt2820852 | 9.335014 | 190000000 | 1506249360 | Furious 7 | Vin Diesel|Paul Walker|Jason Statham|Michelle ... | http://www.furious7.com/ | James Wan | Vengeance Hits Home | ... | Deckard Shaw seeks revenge against Dominic Tor... | 137 | Action|Crime|Thriller | Universal Pictures|Original Film|Media Rights ... | 4/1/15 | 2947 | 7.3 | 2015 | 1.747999e+08 | 1.385749e+09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10861 | 21 | tt0060371 | 0.080598 | 0 | 0 | The Endless Summer | Michael Hynson|Robert August|Lord 'Tally Ho' B... | NaN | Bruce Brown | NaN | ... | The Endless Summer, by Bruce Brown, is one of ... | 95 | Documentary | Bruce Brown Films | 6/15/66 | 11 | 7.4 | 1966 | 0.000000e+00 | 0.000000e+00 |
| 10862 | 20379 | tt0060472 | 0.065543 | 0 | 0 | Grand Prix | James Garner|Eva Marie Saint|Yves Montand|Tosh... | NaN | John Frankenheimer | Cinerama sweeps YOU into a drama of speed and ... | ... | Grand Prix driver Pete Aron is fired by his te... | 176 | Action|Adventure|Drama | Cherokee Productions|Joel Productions|Douglas ... | 12/21/66 | 20 | 5.7 | 1966 | 0.000000e+00 | 0.000000e+00 |
| 10863 | 39768 | tt0060161 | 0.065141 | 0 | 0 | Beregis Avtomobilya | Innokentiy Smoktunovskiy|Oleg Efremov|Georgi Z... | NaN | Eldar Ryazanov | NaN | ... | An insurance agent who moonlights as a carthie... | 94 | Mystery|Comedy | Mosfilm | 1/1/66 | 11 | 6.5 | 1966 | 0.000000e+00 | 0.000000e+00 |
| 10864 | 21449 | tt0061177 | 0.064317 | 0 | 0 | What's Up, Tiger Lily? | Tatsuya Mihashi|Akiko Wakabayashi|Mie Hama|Joh... | NaN | Woody Allen | WOODY ALLEN STRIKES BACK! | ... | In comic Woody Allen's film debut, he took the... | 80 | Action|Comedy | Benedict Pictures Corp. | 11/2/66 | 22 | 5.4 | 1966 | 0.000000e+00 | 0.000000e+00 |
| 10865 | 22293 | tt0060666 | 0.035919 | 19000 | 0 | Manos: The Hands of Fate | Harold P. Warren|Tom Neyman|John Reynolds|Dian... | NaN | Harold P. Warren | It's Shocking! It's Beyond Your Imagination! | ... | A family gets lost on the road and stumbles up... | 74 | Horror | Norm-Iris | 11/15/66 | 15 | 1.5 | 1966 | 1.276423e+05 | 0.000000e+00 |
10866 rows × 21 columns
# Print each column's name, non-null count and dtype
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10866 entries, 0 to 10865 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 10866 non-null int64 1 imdb_id 10856 non-null object 2 popularity 10866 non-null float64 3 budget 10866 non-null int64 4 revenue 10866 non-null int64 5 original_title 10866 non-null object 6 cast 10790 non-null object 7 homepage 2936 non-null object 8 director 10822 non-null object 9 tagline 8042 non-null object 10 keywords 9373 non-null object 11 overview 10862 non-null object 12 runtime 10866 non-null int64 13 genres 10843 non-null object 14 production_companies 9836 non-null object 15 release_date 10866 non-null object 16 vote_count 10866 non-null int64 17 vote_average 10866 non-null float64 18 release_year 10866 non-null int64 19 budget_adj 10866 non-null float64 20 revenue_adj 10866 non-null float64 dtypes: float64(4), int64(6), object(11) memory usage: 1.7+ MB
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()
| id | popularity | budget | revenue | runtime | vote_count | vote_average | release_year | budget_adj | revenue_adj | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 10866.000000 | 10866.000000 | 1.086600e+04 | 1.086600e+04 | 10866.000000 | 10866.000000 | 10866.000000 | 10866.000000 | 1.086600e+04 | 1.086600e+04 |
| mean | 66064.177434 | 0.646441 | 1.462570e+07 | 3.982332e+07 | 102.070863 | 217.389748 | 5.974922 | 2001.322658 | 1.755104e+07 | 5.136436e+07 |
| std | 92130.136561 | 1.000185 | 3.091321e+07 | 1.170035e+08 | 31.381405 | 575.619058 | 0.935142 | 12.812941 | 3.430616e+07 | 1.446325e+08 |
| min | 5.000000 | 0.000065 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 10.000000 | 1.500000 | 1960.000000 | 0.000000e+00 | 0.000000e+00 |
| 25% | 10596.250000 | 0.207583 | 0.000000e+00 | 0.000000e+00 | 90.000000 | 17.000000 | 5.400000 | 1995.000000 | 0.000000e+00 | 0.000000e+00 |
| 50% | 20669.000000 | 0.383856 | 0.000000e+00 | 0.000000e+00 | 99.000000 | 38.000000 | 6.000000 | 2006.000000 | 0.000000e+00 | 0.000000e+00 |
| 75% | 75610.000000 | 0.713817 | 1.500000e+07 | 2.400000e+07 | 111.000000 | 145.750000 | 6.600000 | 2011.000000 | 2.085325e+07 | 3.369710e+07 |
| max | 417859.000000 | 32.985763 | 4.250000e+08 | 2.781506e+09 | 900.000000 | 9767.000000 | 9.200000 | 2015.000000 | 4.250000e+08 | 2.827124e+09 |
# Count the missing values in each column
df.isna().sum()
id 0 imdb_id 10 popularity 0 budget 0 revenue 0 original_title 0 cast 76 homepage 7930 director 44 tagline 2824 keywords 1493 overview 4 runtime 0 genres 23 production_companies 1030 release_date 0 vote_count 0 vote_average 0 release_year 0 budget_adj 0 revenue_adj 0 dtype: int64
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()
1
# Remove, in place, the columns that the analysis below does not use
unused_columns = [
    'id', 'imdb_id', 'budget', 'revenue', 'cast', 'homepage',
    'tagline', 'overview', 'release_date', 'vote_count', 'vote_average',
]
df.drop(columns=unused_columns, inplace=True)
# (rows, columns) after the drop
df.shape
(10866, 10)
# Preview the first five rows of the frame
df.head()
| popularity | original_title | director | keywords | runtime | genres | production_companies | release_year | budget_adj | revenue_adj | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32.985763 | Jurassic World | Colin Trevorrow | monster|dna|tyrannosaurus rex|velociraptor|island | 124 | Action|Adventure|Science Fiction|Thriller | Universal Studios|Amblin Entertainment|Legenda... | 2015 | 1.379999e+08 | 1.392446e+09 |
| 1 | 28.419936 | Mad Max: Fury Road | George Miller | future|chase|post-apocalyptic|dystopia|australia | 120 | Action|Adventure|Science Fiction|Thriller | Village Roadshow Pictures|Kennedy Miller Produ... | 2015 | 1.379999e+08 | 3.481613e+08 |
| 2 | 13.112507 | Insurgent | Robert Schwentke | based on novel|revolution|dystopia|sequel|dyst... | 119 | Adventure|Science Fiction|Thriller | Summit Entertainment|Mandeville Films|Red Wago... | 2015 | 1.012000e+08 | 2.716190e+08 |
| 3 | 11.173104 | Star Wars: The Force Awakens | J.J. Abrams | android|spaceship|jedi|space opera|3d | 136 | Action|Adventure|Science Fiction|Fantasy | Lucasfilm|Truenorth Productions|Bad Robot | 2015 | 1.839999e+08 | 1.902723e+09 |
| 4 | 9.335014 | Furious 7 | James Wan | car race|speed|revenge|suspense|car | 137 | Action|Crime|Thriller | Universal Pictures|Original Film|Media Rights ... | 2015 | 1.747999e+08 | 1.385749e+09 |
# Drop, in place, every row that has a missing value in any column
df.dropna(inplace=True)
# Re-count missing values per column to confirm none remain
df.isna().sum()
popularity 0 original_title 0 director 0 keywords 0 runtime 0 genres 0 production_companies 0 release_year 0 budget_adj 0 revenue_adj 0 dtype: int64
# Drop, in place, rows that exactly duplicate an earlier row
df.drop_duplicates(inplace=True)
# Re-count duplicates to confirm none remain
df.duplicated().sum()
0
# Drop rows whose adjusted budget or adjusted revenue is zero.
# The zeros are first replaced by NaN so that dropna() removes those rows.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['budget_adj'] = df['budget_adj'].replace(0, np.nan)
df['revenue_adj'] = df['revenue_adj'].replace(0, np.nan)
df.dropna(inplace=True)
# Re-count missing values per column to confirm none remain
df.isna().sum()
popularity 0 original_title 0 director 0 keywords 0 runtime 0 genres 0 production_companies 0 release_year 0 budget_adj 0 revenue_adj 0 dtype: int64
# Line chart of the mean adjusted budget per release year
yearly_budget = df.groupby('release_year').budget_adj.mean()
fig = px.line(
    x=yearly_budget.index,
    y=yearly_budget.values,
    title="Average Spending over Time",
    labels={'x': "Release Year", 'y': 'Budget (Adj)'},
)
# Render the chart
fig.show()
# Profit is the adjusted revenue minus the adjusted budget
df['profit'] = df['revenue_adj'] - df['budget_adj']
# Pull out the most and the least profitable movie, each as a one-column
# frame whose column label is the movie's row index
profit = df['profit']
highest = df.loc[profit.idxmax()].to_frame()
lowest = df.loc[profit.idxmin()].to_frame()
# Put them side by side: highest profit on the left, lowest on the right
high_low = pd.concat([highest, lowest], axis=1)
high_low
| 1329 | 2244 | |
|---|---|---|
| popularity | 12.037933 | 0.25054 |
| original_title | Star Wars | The Warrior's Way |
| director | George Lucas | Sngmoo Lee |
| keywords | android|galaxy|hermit|death star|lightsaber | assassin|small town|revenge|deception|super speed |
| runtime | 121 | 100 |
| genres | Adventure|Action|Science Fiction | Adventure|Fantasy|Action|Western|Thriller |
| production_companies | Lucasfilm|Twentieth Century Fox Film Corporation | Boram Entertainment Inc. |
| release_year | 1977 | 2010 |
| budget_adj | 39575591.358274 | 425000000.0 |
| revenue_adj | 2789712242.27745 | 11087569.0 |
| profit | 2750136650.919176 | -413912431.0 |
# Rank the movies by profit, highest first; titles are attached afterwards
# and line up with the sorted rows through the shared index
df_pf = df['profit'].sort_values(ascending=False).to_frame()
df_pf['original_title'] = df['original_title']
top_ten = df_pf.head(10)
fig = px.bar(
    x=top_ten['original_title'],
    y=top_ten['profit'],
    title="Top 10 Movies with highest profit",
    labels={'x': " ", 'y': 'Profit'},
)
# Render the chart
fig.show()
# Build a genre-focused frame in which the '|'-separated genre string of
# each movie is turned into a list of genre names
genre_columns = ['genres', 'release_year', 'popularity']
df_gen = df[genre_columns].assign(genres=df['genres'].str.split('|'))
df_gen
| genres | release_year | popularity | |
|---|---|---|---|
| 0 | [Action, Adventure, Science Fiction, Thriller] | 2015 | 32.985763 |
| 1 | [Action, Adventure, Science Fiction, Thriller] | 2015 | 28.419936 |
| 2 | [Adventure, Science Fiction, Thriller] | 2015 | 13.112507 |
| 3 | [Action, Adventure, Science Fiction, Fantasy] | 2015 | 11.173104 |
| 4 | [Action, Crime, Thriller] | 2015 | 9.335014 |
| ... | ... | ... | ... |
| 10822 | [Drama] | 1966 | 0.670274 |
| 10828 | [Mystery, Thriller] | 1966 | 0.402730 |
| 10829 | [Action, Western] | 1966 | 0.395668 |
| 10835 | [Action, Adventure, Drama, War, Romance] | 1966 | 0.299911 |
| 10848 | [Adventure, Science Fiction] | 1966 | 0.207257 |
3679 rows × 3 columns
# Expand every movie into one row per genre. explode() repeats the movie's
# index label and its other columns for each element of the genre list,
# replacing the row-wise apply(pd.Series).stack() + join (which was also
# being computed twice). The column order matches the previous result.
df_gen = df_gen.explode('genres')[['release_year', 'popularity', 'genres']]
df_gen
| release_year | popularity | genres | |
|---|---|---|---|
| 0 | 2015 | 32.985763 | Action |
| 0 | 2015 | 32.985763 | Adventure |
| 0 | 2015 | 32.985763 | Science Fiction |
| 0 | 2015 | 32.985763 | Thriller |
| 1 | 2015 | 28.419936 | Action |
| ... | ... | ... | ... |
| 10835 | 1966 | 0.299911 | Drama |
| 10835 | 1966 | 0.299911 | War |
| 10835 | 1966 | 0.299911 | Romance |
| 10848 | 1966 | 0.207257 | Adventure |
| 10848 | 1966 | 0.207257 | Science Fiction |
9860 rows × 3 columns
# Bar chart of how many movie rows carry each genre, most frequent first
genre_counts = df_gen['genres'].value_counts()
fig = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    title="Genres Occurances",
    labels={'x': "", 'y': 'Count'},
)
fig.show()
# Keep the 13 most frequent genres and fold every remaining genre into a
# single 'Others' entry. Both parts are sliced from the same ranked counts,
# so the kept genres and the 'Others' total can never disagree (previously a
# positional slice had to match a hand-written list of genre names).
TOP_GENRES = 13
genre_counts = df_gen['genres'].value_counts()
gen = genre_counts.iloc[:TOP_GENRES].copy()
gen['Others'] = genre_counts.iloc[TOP_GENRES:].sum()
gen
Drama 1667 Comedy 1280 Thriller 1160 Action 1043 Adventure 725 Romance 628 Crime 628 Science Fiction 510 Horror 445 Family 403 Fantasy 385 Mystery 337 Animation 195 Others 454 Name: genres, dtype: int64
# Pie chart of each entry's share of the condensed genre counts in `gen`
fig = px.pie(
    names=gen.index,
    values=gen.values,
    title="Genres Percentages ",
)
fig.show()
# Display the one-row-per-genre frame
df_gen
| release_year | popularity | genres | |
|---|---|---|---|
| 0 | 2015 | 32.985763 | Action |
| 0 | 2015 | 32.985763 | Adventure |
| 0 | 2015 | 32.985763 | Science Fiction |
| 0 | 2015 | 32.985763 | Thriller |
| 1 | 2015 | 28.419936 | Action |
| ... | ... | ... | ... |
| 10835 | 1966 | 0.299911 | Drama |
| 10835 | 1966 | 0.299911 | War |
| 10835 | 1966 | 0.299911 | Romance |
| 10848 | 1966 | 0.207257 | Adventure |
| 10848 | 1966 | 0.207257 | Science Fiction |
9860 rows × 3 columns
def f(year):
    """Show a bar chart of total popularity per genre for one release year.

    Rows of ``df_gen`` whose ``release_year`` equals ``year`` are grouped by
    genre, their popularity is summed, and the genres are plotted from the
    highest to the lowest total.
    """
    yearly = df_gen[df_gen["release_year"] == year]
    b = yearly.groupby('genres').popularity.sum().sort_values(ascending=False)
    fig = px.bar(x=b.index, y=b.values,
                 labels={'y': 'Popularity', 'x': 'Genres'},
                 title=f'Most Popular genres in {year}')
    fig.show()


# Let the user pick the year interactively. The years are sorted so the
# choices appear in chronological order (a bare set has no defined order).
# The trailing semicolon suppresses the cell's echoed return value.
interact(f, year=sorted(set(df_gen["release_year"])));
# Install the wordcloud package used by the next cell.
# NOTE(review): a bare `pip install` is not Python syntax; it presumably
# relies on the notebook treating it as a magic command -- confirm, or
# prefix it explicitly when running outside a notebook.
pip install wordcloud
Requirement already satisfied: wordcloud in e:\anaconda3\lib\site-packages (1.8.1) Requirement already satisfied: pillow in e:\anaconda3\lib\site-packages (from wordcloud) (8.2.0) Requirement already satisfied: numpy>=1.6.1 in e:\anaconda3\lib\site-packages (from wordcloud) (1.20.1) Requirement already satisfied: matplotlib in e:\anaconda3\lib\site-packages (from wordcloud) (3.3.4) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in e:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.4.7) Requirement already satisfied: python-dateutil>=2.1 in e:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.1) Requirement already satisfied: kiwisolver>=1.0.1 in e:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.3.1) Requirement already satisfied: cycler>=0.10 in e:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.10.0) Requirement already satisfied: six in e:\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.15.0) Note: you may need to restart the kernel to use updated packages.
from wordcloud import WordCloud, STOPWORDS
# Join the keywords of all movies into one '|'-separated string and draw a
# word cloud from it on a black 1280x720 canvas
keyword_text = df['keywords'].str.cat(sep='|')
cloud = WordCloud(width=1280, height=720, background_color="black")
wordcloud = cloud.generate(keyword_text)
fig = px.imshow(wordcloud)
fig.show()
# One row per (movie, production company) pair.
# The '|'-separated company string is split into a list and explode() then
# repeats the movie's index label and profit for every company in it. This
# replaces the slow row-wise apply(pd.Series).stack() + join and yields the
# same columns ('profit', 'production_companies') and the same index.
df_prod = df[['profit', 'production_companies']].copy()
df_prod['production_companies'] = df_prod['production_companies'].str.split('|')
df_prod = df_prod.explode('production_companies')
df_prod
| profit | production_companies | |
|---|---|---|
| 0 | 1.254446e+09 | Universal Studios |
| 0 | 1.254446e+09 | Amblin Entertainment |
| 0 | 1.254446e+09 | Legendary Pictures |
| 0 | 1.254446e+09 | Fuji Television Network |
| 0 | 1.254446e+09 | Dentsu |
| ... | ... | ... |
| 10829 | 9.049166e+06 | Laurel Productions |
| 10835 | 5.374412e+07 | Twentieth Century Fox Film Corporation |
| 10835 | 5.374412e+07 | Solar Productions |
| 10835 | 5.374412e+07 | Robert Wise Productions |
| 10848 | 4.625353e+07 | Twentieth Century Fox Film Corporation |
10229 rows × 2 columns
# Horizontal bar chart of the ten production companies with the largest
# summed profit (sorted ascending, so the largest bar is drawn last)
company_profit = df_prod.groupby('production_companies')['profit'].sum()
top_prod = company_profit.sort_values().tail(10)
fig = px.bar(
    x=top_prod.values,
    y=top_prod.index,
    orientation='h',
    title="10 Highest Earning Film Production Companies",
    labels={'x': "Profit", 'y': ''},
)
fig.show()
# Horizontal bar chart of the ten directors with the largest summed
# popularity across their movies (sorted ascending)
director_popularity = df.groupby('director')['popularity'].sum()
top_dir = director_popularity.sort_values().tail(10)
fig = px.bar(
    x=top_dir.values,
    y=top_dir.index,
    orientation='h',
    title="Directors With Most Popular Movies",
    labels={'x': "Popularity", 'y': ''},
)
fig.show()
# Box plots of the adjusted budget and then the adjusted revenue
for column in ('budget_adj', 'revenue_adj'):
    x = df[column]
    fig = px.box(df, x)
    fig.show()
# Scatter plot of adjusted revenue (x axis) against adjusted budget (y axis)
axis_labels = {'x': "Revenue _adj", 'y': 'Budget_adj'}
fig = px.scatter(
    x=df['revenue_adj'],
    y=df['budget_adj'],
    title="Revenue Vs Budget",
    labels=axis_labels,
)
fig.show()
import plotly
import plotly.graph_objs as go
from datetime import datetime
# Revenue vs budget drawn twice in one figure -- once as markers and once as
# a line -- with two buttons that switch which trace is visible.
# px.scatter()/px.line() return whole Figure objects (and px.line() accepts
# no `name` argument, which raised the TypeError), so they cannot be used as
# entries of `data`; graph_objects traces are used instead.
# The line trace uses the rows sorted by revenue so it runs left to right.
by_revenue = df[['revenue_adj', 'budget_adj']].sort_values('revenue_adj')
plot = go.Figure(data=[
    go.Scatter(name='Scatter Plot', mode='markers',
               x=df['revenue_adj'], y=df['budget_adj']),
    go.Scatter(name='Line Plot', mode='lines', visible=False,
               x=by_revenue['revenue_adj'], y=by_revenue['budget_adj']),
])
# Add the toggle buttons; the figure starts in the "Scatter Plot" state
plot.update_layout(
    title="Scatter Plot",
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(label="Scatter Plot",
                     method="update",
                     args=[{"visible": [True, False]},
                           {"title": "Scatter Plot"}]),
                dict(label="Line Plot",
                     method="update",
                     args=[{"visible": [False, True]},
                           {"title": "Line Plot"}]),
            ],
        )
    ],
)
plot.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-35-347eedb081c9> in <module> 1 plot = go.Figure(data=[px.scatter(x=df['revenue_adj'],y=df['budget_adj']), ----> 2 px.line( name='Bar Plot',x=df['revenue_adj'],y=df['budget_adj'])]) 3 # Add dropdown 4 plot.update_layout( 5 updatemenus=[ TypeError: line() got an unexpected keyword argument 'name'
# Single-trace variant: one revenue/budget trace whose drawing mode is
# switched between lines and markers by "restyle" buttons.
# The figure is created here (it was never defined before) and the trace is
# a go.Scatter; "line" is not a trace type, so the buttons restyle the
# trace's `mode` instead of its `type`.
# Rows are sorted by revenue so the line mode runs left to right.
by_revenue = df[['revenue_adj', 'budget_adj']].sort_values('revenue_adj')
x = by_revenue['revenue_adj']
y = by_revenue['budget_adj']
figo = go.Figure()
figo.add_trace(go.Scatter(x=x, y=y, mode="markers"))
# Update plot sizing
figo.update_layout(
    width=800,
    height=900,
    autosize=False,
    margin=dict(t=0, b=0, l=0, r=0),
    template="plotly_white",
)
figo.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    args=["mode", "lines"],
                    label="Line Chart",
                    method="restyle"
                ),
                dict(
                    args=["mode", "markers"],
                    label="Scatter Plot",
                    method="restyle"
                ),
            ],
            # The trace starts in marker mode, i.e. the second button
            active=1,
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.11,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ]
)
figo.show()
# Scatter plot of adjusted revenue (x axis) against popularity (y axis)
axis_labels = {'x': "Revenue _adj", 'y': 'Popularity'}
fig = px.scatter(
    x=df['revenue_adj'],
    y=df['popularity'],
    title="Revenue Vs Popularity",
    labels=axis_labels,
)
fig.show()
# Heatmap of the pairwise correlations between revenue, budget, popularity
# and runtime, to see which properties move together with revenue
corr_columns = ['revenue_adj', 'budget_adj', 'popularity', 'runtime']
corr = df[corr_columns].corr()
fig = px.imshow(
    corr,
    title='Revenue Vs (Budget,Runtime,Popularity)',
    color_continuous_scale=px.colors.sequential.Blues,
)
fig.show()
# Histograms of runtime, budget and revenue, shown one after another.
# Each entry is (column, axis label / title suffix, number of bins).
histogram_specs = [
    ('runtime', 'Runtime', 35),
    ('budget_adj', 'Budget', 35),
    ('revenue_adj', 'Revenue', 40),
]
for column, label, bins in histogram_specs:
    fig = px.histogram(
        x=df[column],
        labels={'x': label},
        title=f'Distribution of {label}',
        nbins=bins,
    )
    fig.show()
Drama has the highest number of released movies followed by Comedy.
The highest-earning film production companies are:
Directors with the most popular movies:
Movies with higher budgets and higher popularity tend to earn higher revenues.